install.packages("copula")
## Error in contrib.url(repos, "source"): trying to use CRAN without setting a mirror
rm(list=ls())
library(ggplot2)
library(tibble)
library(dplyr)
##
## Caricamento pacchetto: 'dplyr'
## I seguenti oggetti sono mascherati da 'package:stats':
##
## filter, lag
## I seguenti oggetti sono mascherati da 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(readr)
library(lubridate)
##
## Caricamento pacchetto: 'lubridate'
## I seguenti oggetti sono mascherati da 'package:base':
##
## date, intersect, setdiff, union
library(dplyr)
library(copula)
##
## Caricamento pacchetto: 'copula'
## Il seguente oggetto è mascherato da 'package:lubridate':
##
## interval
#Exercise 1
data <- read_csv("Global_Mobility_Report.csv")
## Rows: 11730025 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (8): country_region_code, country_region, sub_region_1, sub_region_2, m...
## dbl (6): retail_and_recreation_percent_change_from_baseline, grocery_and_ph...
## date (1): date
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data
## # A tibble: 11,730,025 × 15
## country_region_code country_region sub_region_1 sub_region_2 metro_area
## <chr> <chr> <chr> <chr> <chr>
## 1 AE United Arab Emirates <NA> <NA> <NA>
## 2 AE United Arab Emirates <NA> <NA> <NA>
## 3 AE United Arab Emirates <NA> <NA> <NA>
## 4 AE United Arab Emirates <NA> <NA> <NA>
## 5 AE United Arab Emirates <NA> <NA> <NA>
## 6 AE United Arab Emirates <NA> <NA> <NA>
## 7 AE United Arab Emirates <NA> <NA> <NA>
## 8 AE United Arab Emirates <NA> <NA> <NA>
## 9 AE United Arab Emirates <NA> <NA> <NA>
## 10 AE United Arab Emirates <NA> <NA> <NA>
## # … with 11,730,015 more rows, and 10 more variables: iso_3166_2_code <chr>,
## # census_fips_code <chr>, place_id <chr>, date <date>,
## # retail_and_recreation_percent_change_from_baseline <dbl>,
## # grocery_and_pharmacy_percent_change_from_baseline <dbl>,
## # parks_percent_change_from_baseline <dbl>,
## # transit_stations_percent_change_from_baseline <dbl>,
## # workplaces_percent_change_from_baseline <dbl>, …
I select the data that refers to Italy.
data_it<-data[data$country_region_code=="IT",]
data_it
## # A tibble: 130,939 × 15
## country_region_code country_region sub_region_1 sub_region_2 metro_area
## <chr> <chr> <chr> <chr> <chr>
## 1 IT Italy <NA> <NA> <NA>
## 2 IT Italy <NA> <NA> <NA>
## 3 IT Italy <NA> <NA> <NA>
## 4 IT Italy <NA> <NA> <NA>
## 5 IT Italy <NA> <NA> <NA>
## 6 IT Italy <NA> <NA> <NA>
## 7 IT Italy <NA> <NA> <NA>
## 8 IT Italy <NA> <NA> <NA>
## 9 IT Italy <NA> <NA> <NA>
## 10 IT Italy <NA> <NA> <NA>
## # … with 130,929 more rows, and 10 more variables: iso_3166_2_code <chr>,
## # census_fips_code <chr>, place_id <chr>, date <date>,
## # retail_and_recreation_percent_change_from_baseline <dbl>,
## # grocery_and_pharmacy_percent_change_from_baseline <dbl>,
## # parks_percent_change_from_baseline <dbl>,
## # transit_stations_percent_change_from_baseline <dbl>,
## # workplaces_percent_change_from_baseline <dbl>, …
I filter the data with the country_region attribute different from Na.
data_it<-data_it[is.na(data_it$country_region_code)==FALSE,]
I exctract the week, the month and the year of each record and create the proper columns
data_it['week']<-week(ymd(data_it$date))
data_it['month']<-month(ymd(data_it$date))
data_it['year']<-year(ymd(data_it$date))
data_it
## # A tibble: 123,695 × 18
## country_region_code country_region sub_region_1 sub_region_2 metro_area
## <chr> <chr> <chr> <chr> <chr>
## 1 IT Italy <NA> <NA> <NA>
## 2 IT Italy <NA> <NA> <NA>
## 3 IT Italy <NA> <NA> <NA>
## 4 IT Italy <NA> <NA> <NA>
## 5 IT Italy <NA> <NA> <NA>
## 6 IT Italy <NA> <NA> <NA>
## 7 IT Italy <NA> <NA> <NA>
## 8 IT Italy <NA> <NA> <NA>
## 9 IT Italy <NA> <NA> <NA>
## 10 IT Italy <NA> <NA> <NA>
## # … with 123,685 more rows, and 13 more variables: iso_3166_2_code <chr>,
## # census_fips_code <chr>, place_id <chr>, date <date>,
## # retail_and_recreation_percent_change_from_baseline <dbl>,
## # grocery_and_pharmacy_percent_change_from_baseline <dbl>,
## # parks_percent_change_from_baseline <dbl>,
## # transit_stations_percent_change_from_baseline <dbl>,
## # workplaces_percent_change_from_baseline <dbl>, …
I select the data that refers only to the Italy in total (not specifically to regions or cities)
data_it<-data_it[is.na(data_it$metro_area)==TRUE,]
data_it<-data_it[is.na(data_it$sub_region_2)==TRUE,]
data_it<-data_it[is.na(data_it$sub_region_1)==TRUE,]
Using the group by function I calculate the average per week of the attributes of the dataframe
week_df = data_it %>%
group_by(year,week) %>%
summarise(avg_retail=mean(retail_and_recreation_percent_change_from_baseline),
avg_gp=mean(grocery_and_pharmacy_percent_change_from_baseline),
avg_parks=mean(parks_percent_change_from_baseline),
avg_tran_st=mean(transit_stations_percent_change_from_baseline),
avg_workplace=mean(workplaces_percent_change_from_baseline),
avg_res=mean(residential_percent_change_from_baseline),
.groups="keep")
Printing the obtained dataframe
week_df
## # A tibble: 142 × 8
## # Groups: year, week [142]
## year week avg_retail avg_gp avg_parks avg_tran_st avg_workplace avg_res
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2020 7 2.25 0.75 21 6.75 0.75 -0.75
## 2 2020 8 -1.86 6.86 20.4 -0.857 -3.71 1
## 3 2020 9 -11.4 -3.57 -8.43 -21.9 -9.57 4.71
## 4 2020 10 -22.6 0.286 -9.29 -31.4 -17.3 8.29
## 5 2020 11 -74.6 -35.1 -64.4 -71.6 -52.7 24.9
## 6 2020 12 -82.9 -50.9 -79.9 -80.3 -64.3 29.6
## 7 2020 13 -86 -53.9 -83 -82.3 -68.6 31.6
## 8 2020 14 -87.6 -46.4 -79.6 -79.9 -66.7 30.7
## 9 2020 15 -86.1 -52.3 -80.6 -81 -69.3 31
## 10 2020 16 -82.1 -47.4 -78.6 -77.9 -61.1 28.4
## # … with 132 more rows
Here i plot the results. For each quantity I use three plots, one for each year
for (i in 2020:2022) {
df_year<-week_df[week_df$year==i,]
x = df_year$week
y = df_year$avg_retail
title<-paste("average retail and recreation percent change from baseline",as.character(i))
plt<-ggplot(df_year,aes(x=x,y=y,group=1))+
geom_line(colour="red")+
geom_point(stat="identity",colour="black",pch=21, size=1.5, fill='red')+
xlab("week")+
ylab("")+
scale_y_continuous(limits = c(-90, 10), breaks = seq(-90, 10, by=10))+
scale_x_continuous(limits = c(0, 52), breaks = seq(0, 52, by=5))+
theme(axis.text.x = element_text(face="bold.italic", size=10), axis.text.y =
element_text(face="bold.italic", size=10), title =
element_text(face="bold.italic", size=12))+
ggtitle(title)
print(plt)
}
## Warning: Removed 1 row(s) containing missing values (geom_path).
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 row(s) containing missing values (geom_path).
## Removed 1 rows containing missing values (geom_point).
for (i in 2020:2022) {
df_year<-week_df[week_df$year==i,]
x = df_year$week
y = df_year$avg_gp
title<-paste("average grocery and pharmacy percent change from baseline",as.character(i))
plt<-ggplot(df_year,aes(x=x,y=y,group=1))+
geom_line(colour="blue")+
geom_point(stat="identity",colour="black",pch=21, size=1.5, fill='blue')+
xlab("week")+
ylab("")+
scale_y_continuous(limits = c(-60, 40), breaks = seq(-60, 40, by=10))+
scale_x_continuous(limits = c(0, 52), breaks = seq(0, 52, by=5))+
theme(axis.text.x = element_text(face="bold.italic", size=10), axis.text.y =
element_text(face="bold.italic", size=10), title =
element_text(face="bold.italic", size=12))+
ggtitle(title)
print(plt)
}
## Warning: Removed 1 row(s) containing missing values (geom_path).
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 row(s) containing missing values (geom_path).
## Removed 1 rows containing missing values (geom_point).
for (i in 2020:2022) {
df_year<-week_df[week_df$year==i,]
x = df_year$week
y = df_year$avg_parks
title<-paste("average parks percent change from baseline",as.character(i))
plt<-ggplot(df_year,aes(x=x,y=y,group=1))+
geom_line(colour="green")+
geom_point(stat="identity",colour="black",pch=21, size=1.5, fill='green')+
xlab("week")+
ylab("")+
scale_y_continuous(limits = c(-100, 200), breaks = seq(-100, 200, by=10))+
scale_x_continuous(limits = c(0, 52), breaks = seq(0, 52, by=5))+
theme(axis.text.x = element_text(face="bold.italic", size=10), axis.text.y =
element_text(face="bold.italic", size=10), title =
element_text(face="bold.italic", size=12))+
ggtitle(title)
print(plt)
}
## Warning: Removed 1 row(s) containing missing values (geom_path).
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 row(s) containing missing values (geom_path).
## Removed 1 rows containing missing values (geom_point).
for (i in 2020:2022) {
df_year<-week_df[week_df$year==i,]
x = df_year$week
y = df_year$avg_tran_st
title<-paste("average transit stations percent change from baseline",as.character(i))
plt<-ggplot(df_year,aes(x=x,y=y,group=1))+
geom_line(colour="violet")+
geom_point(stat="identity",colour="black",pch=21, size=1.5, fill='violet')+
xlab("week")+
ylab("")+
scale_y_continuous(limits = c(-90, 10), breaks = seq(-90, 10, by=10))+
scale_x_continuous(limits = c(0, 52), breaks = seq(0, 52, by=5))+
theme(axis.text.x = element_text(face="bold.italic", size=10), axis.text.y =
element_text(face="bold.italic", size=10), title =
element_text(face="bold.italic", size=12))+
ggtitle(title)
print(plt)
}
## Warning: Removed 1 row(s) containing missing values (geom_path).
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 row(s) containing missing values (geom_path).
## Removed 1 rows containing missing values (geom_point).
for (i in 2020:2022) {
df_year<-week_df[week_df$year==i,]
x = df_year$week
y = df_year$avg_workplace
title<-paste("average workplaces percent change from baseline",as.character(i))
plt<-ggplot(df_year,aes(x=x,y=y,group=1))+
geom_line(colour="orange")+
geom_point(stat="identity",colour="black",pch=21, size=1.5, fill='orange')+
xlab("week")+
ylab("")+
scale_y_continuous(limits = c(-80, 0), breaks = seq(-80, 0, by=10))+
scale_x_continuous(limits = c(0, 52), breaks = seq(0, 52, by=5))+
theme(axis.text.x = element_text(face="bold.italic", size=10), axis.text.y =
element_text(face="bold.italic", size=10), title =
element_text(face="bold.italic", size=12))+
ggtitle(title)
print(plt)
}
## Warning: Removed 2 row(s) containing missing values (geom_path).
## Warning: Removed 2 rows containing missing values (geom_point).
## Warning: Removed 1 row(s) containing missing values (geom_path).
## Warning: Removed 1 rows containing missing values (geom_point).
for (i in 2020:2022) {
df_year<-week_df[week_df$year==i,]
x = df_year$week
y = df_year$avg_res
title<-paste("average residential percent change from baseline",as.character(i))
plt<-ggplot(df_year,aes(x=x,y=y,group=1))+
geom_line(colour="pink")+
geom_point(stat="identity",colour="black",pch=21, size=1.5, fill='pink')+
xlab("week")+
ylab("")+
scale_y_continuous(limits = c(-10, 40), breaks = seq(-10, 40, by=10))+
scale_x_continuous(limits = c(0, 52), breaks = seq(0, 52, by=5))+
theme(axis.text.x = element_text(face="bold.italic", size=10), axis.text.y =
element_text(face="bold.italic", size=10), title =
element_text(face="bold.italic", size=12))+
ggtitle(title)
print(plt)
}
## Warning: Removed 1 row(s) containing missing values (geom_path).
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 row(s) containing missing values (geom_path).
## Removed 1 rows containing missing values (geom_point).
I repeat the previous analysis, but grouping by the months.
month_df = data_it %>%
group_by(year,month) %>%
summarise(avg_retail=mean(retail_and_recreation_percent_change_from_baseline),
avg_gp=mean(grocery_and_pharmacy_percent_change_from_baseline),
avg_parks=mean(parks_percent_change_from_baseline),
avg_tran_st=mean(transit_stations_percent_change_from_baseline),
avg_workplace=mean(workplaces_percent_change_from_baseline),
avg_res=mean(residential_percent_change_from_baseline),
.groups="keep")
month_df
## # A tibble: 33 × 8
## # Groups: year, month [33]
## year month avg_retail avg_gp avg_parks avg_tran_st avg_workplace avg_res
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2020 2 -3 2.6 15.5 -4 -4.53 1.6
## 2 2020 3 -61.3 -31.9 -55.5 -62.3 -46.5 21.7
## 3 2020 4 -83.4 -47.6 -77.7 -78.3 -63.9 29.2
## 4 2020 5 -52.2 -28.3 -21.2 -52.2 -37.1 15.9
## 5 2020 6 -19.8 -11.9 35.4 -31.4 -24.7 5.73
## 6 2020 7 -8.58 -7.39 79.3 -21.5 -19.8 0.645
## 7 2020 8 -5.81 -6.19 126 -19.5 -30.3 0.935
## 8 2020 9 -6.73 -3.97 58.9 -16.2 -19.6 1.07
## 9 2020 10 -17.4 -1.74 9.87 -20.1 -17 5.16
## 10 2020 11 -40.4 -12.3 -18.9 -45.1 -28.8 13.6
## # … with 23 more rows
for (i in 2020:2022) {
df_year<-month_df[month_df$year==i,]
x = df_year$month
y = df_year$avg_retail
title<-paste("average retail and recreation percent change from baseline",as.character(i))
plt<-ggplot(df_year,aes(x=x,y=y,group=1))+
geom_line(colour="red")+
geom_point(stat="identity",colour="black",pch=21, size=1.5, fill='red')+
xlab("month")+
ylab("")+
scale_y_continuous(limits = c(-90, 10), breaks = seq(-90, 10, by=10))+
scale_x_continuous(limits = c(0, 12), breaks = seq(0, 12, by=1))+
theme(axis.text.x = element_text(face="bold.italic", size=10), axis.text.y =
element_text(face="bold.italic", size=10), title =
element_text(face="bold.italic", size=12))+
ggtitle(title)
print(plt)
}
for (i in 2020:2022) {
df_year<-month_df[month_df$year==i,]
x = df_year$month
y = df_year$avg_gp
title<-paste("average grocery and pharmacy percent change from baseline",as.character(i))
plt<-ggplot(df_year,aes(x=x,y=y,group=1))+
geom_line(colour="blue")+
geom_point(stat="identity",colour="black",pch=21, size=1.5, fill='blue')+
xlab("month")+
ylab("")+
scale_y_continuous(limits = c(-50, 40), breaks = seq(-50, 40, by=10))+
scale_x_continuous(limits = c(0, 12), breaks = seq(0, 12, by=1))+
theme(axis.text.x = element_text(face="bold.italic", size=10), axis.text.y =
element_text(face="bold.italic", size=10), title =
element_text(face="bold.italic", size=12))+
ggtitle(title)
print(plt)
}
for (i in 2020:2022) {
df_year<-month_df[month_df$year==i,]
x = df_year$month
y = df_year$avg_parks
title<-paste("average parks percent change from baseline",as.character(i))
plt<-ggplot(df_year,aes(x=x,y=y,group=1))+
geom_line(colour="green")+
geom_point(stat="identity",colour="black",pch=21, size=1.5, fill='green')+
xlab("month")+
ylab("")+
scale_y_continuous(limits = c(-90, 170), breaks = seq(-90, 170, by=10))+
scale_x_continuous(limits = c(0, 12), breaks = seq(0, 12, by=1))+
theme(axis.text.x = element_text(face="bold.italic", size=10), axis.text.y =
element_text(face="bold.italic", size=10), title =
element_text(face="bold.italic", size=12))+
ggtitle(title)
print(plt)
}
for (i in 2020:2022) {
df_year<-month_df[month_df$year==i,]
x = df_year$month
y = df_year$avg_tran_st
title<-paste("average transit stations percent change from baseline",as.character(i))
plt<-ggplot(df_year,aes(x=x,y=y,group=1))+
geom_line(colour="violet")+
geom_point(stat="identity",colour="black",pch=21, size=1.5, fill='violet')+
xlab("month")+
ylab("")+
scale_y_continuous(limits = c(-90, 0), breaks = seq(-90, 0, by=10))+
scale_x_continuous(limits = c(0, 12), breaks = seq(0, 12, by=1))+
theme(axis.text.x = element_text(face="bold.italic", size=10), axis.text.y =
element_text(face="bold.italic", size=10), title =
element_text(face="bold.italic", size=12))+
ggtitle(title)
print(plt)
}
for (i in 2020:2022) {
df_year<-month_df[month_df$year==i,]
x = df_year$month
y = df_year$avg_workplace
title<-paste("average workplaces percent change from baseline",as.character(i))
plt<-ggplot(df_year,aes(x=x,y=y,group=1))+
geom_line(colour="orange")+
geom_point(stat="identity",colour="black",pch=21, size=1.5, fill='orange')+
xlab("month")+
ylab("")+
scale_y_continuous(limits = c(-80, 0), breaks = seq(-80, 0, by=10))+
scale_x_continuous(limits = c(0, 12), breaks = seq(0, 12, by=1))+
theme(axis.text.x = element_text(face="bold.italic", size=10), axis.text.y =
element_text(face="bold.italic", size=10), title =
element_text(face="bold.italic", size=12))+
ggtitle(title)
print(plt)
}
for (i in 2020:2022) {
df_year<-month_df[month_df$year==i,]
x = df_year$month
y = df_year$avg_res
title<-paste("average residential percent change from baseline",as.character(i))
plt<-ggplot(df_year,aes(x=x,y=y,group=1))+
geom_line(colour="pink")+
geom_point(stat="identity",colour="black",pch=21, size=1.5, fill='pink')+
xlab("month")+
ylab("")+
scale_y_continuous(limits = c(-10, 40), breaks = seq(-10, 40, by=10))+
scale_x_continuous(limits = c(0, 12), breaks = seq(0, 12, by=1))+
theme(axis.text.x = element_text(face="bold.italic", size=10), axis.text.y =
element_text(face="bold.italic", size=10), title =
element_text(face="bold.italic", size=12))+
ggtitle(title)
print(plt)
}
From the data it is possible that all quantities but residential
percenta change are characterized by a drop during the first months and
weeks of the pandemic, obviuously for the lockdown. It is possible to
see a drop alsobetween the last weeks of the 2020 and the first weeks of
the 2021, corresponding to the second wave of covid 19. After this two
periods the quantities are characterized by a general positive slope,
corresponding to the periodafter the pandemic waves. It is interesting
to notice that workplaces and the transit stations have never reached
the pre-pandemic levels while parks and retail/recreation places
overcome the baseline. Probably because those are recreation places.
Also groceries and pharmacies has overcome the baseline levels. Parks
are characterized by a strong increases during warm months. Basically
the values reletated to ‘residential’ have an opposite behaviour. Indeed
they increase during the first two waves periods and they drop after
that. It is interesting to notice that the change with respect to the
baseline is almost always positive, even in the warm periods, meaning
that after covid 19 people tend to spend more time at home.
Here I download the data related to France and repeat the previous analysis.
data_fr<-data[data$country_region_code=="FR",]
data_fr
## # A tibble: 114,373 × 15
## country_region_code country_region sub_region_1 sub_region_2 metro_area
## <chr> <chr> <chr> <chr> <chr>
## 1 FR France <NA> <NA> <NA>
## 2 FR France <NA> <NA> <NA>
## 3 FR France <NA> <NA> <NA>
## 4 FR France <NA> <NA> <NA>
## 5 FR France <NA> <NA> <NA>
## 6 FR France <NA> <NA> <NA>
## 7 FR France <NA> <NA> <NA>
## 8 FR France <NA> <NA> <NA>
## 9 FR France <NA> <NA> <NA>
## 10 FR France <NA> <NA> <NA>
## # … with 114,363 more rows, and 10 more variables: iso_3166_2_code <chr>,
## # census_fips_code <chr>, place_id <chr>, date <date>,
## # retail_and_recreation_percent_change_from_baseline <dbl>,
## # grocery_and_pharmacy_percent_change_from_baseline <dbl>,
## # parks_percent_change_from_baseline <dbl>,
## # transit_stations_percent_change_from_baseline <dbl>,
## # workplaces_percent_change_from_baseline <dbl>, …
data_fr<-data_fr[is.na(data_fr$country_region_code)==FALSE,]
data_fr['week']<-week(ymd(data_fr$date))
data_fr['month']<-month(ymd(data_fr$date))
data_fr['year']<-year(ymd(data_fr$date))
data_fr
## # A tibble: 107,129 × 18
## country_region_code country_region sub_region_1 sub_region_2 metro_area
## <chr> <chr> <chr> <chr> <chr>
## 1 FR France <NA> <NA> <NA>
## 2 FR France <NA> <NA> <NA>
## 3 FR France <NA> <NA> <NA>
## 4 FR France <NA> <NA> <NA>
## 5 FR France <NA> <NA> <NA>
## 6 FR France <NA> <NA> <NA>
## 7 FR France <NA> <NA> <NA>
## 8 FR France <NA> <NA> <NA>
## 9 FR France <NA> <NA> <NA>
## 10 FR France <NA> <NA> <NA>
## # … with 107,119 more rows, and 13 more variables: iso_3166_2_code <chr>,
## # census_fips_code <chr>, place_id <chr>, date <date>,
## # retail_and_recreation_percent_change_from_baseline <dbl>,
## # grocery_and_pharmacy_percent_change_from_baseline <dbl>,
## # parks_percent_change_from_baseline <dbl>,
## # transit_stations_percent_change_from_baseline <dbl>,
## # workplaces_percent_change_from_baseline <dbl>, …
data_fr<-data_fr[is.na(data_fr$metro_area)==TRUE,]
data_fr<-data_fr[is.na(data_fr$sub_region_2)==TRUE,]
data_fr<-data_fr[is.na(data_fr$sub_region_1)==TRUE,]
week_df = data_fr %>%
group_by(year,week) %>%
summarise(avg_retail=mean(retail_and_recreation_percent_change_from_baseline),
avg_gp=mean(grocery_and_pharmacy_percent_change_from_baseline),
avg_parks=mean(parks_percent_change_from_baseline),
avg_tran_st=mean(transit_stations_percent_change_from_baseline),
avg_workplace=mean(workplaces_percent_change_from_baseline),
avg_res=mean(residential_percent_change_from_baseline),
.groups="keep")
week_df
## # A tibble: 142 × 8
## # Groups: year, week [142]
## year week avg_retail avg_gp avg_parks avg_tran_st avg_workplace avg_res
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2020 7 2.5 1.25 21 2.25 -8 1.25
## 2 2020 8 -0.143 0.429 29.1 1.71 -11 1.86
## 3 2020 9 -3.43 1.57 -0.143 -1.71 -7.86 2.43
## 4 2020 10 -4.29 1.43 -0.857 -3.57 -2.14 2
## 5 2020 11 -27.4 11.3 -2.14 -22.4 -15.4 7.14
## 6 2020 12 -83.7 -47.4 -66 -80.6 -67 29.3
## 7 2020 13 -85.9 -50.4 -69.6 -83.6 -69 30.3
## 8 2020 14 -86.3 -41.7 -65.4 -81.1 -67.1 29.3
## 9 2020 15 -84.1 -43.1 -66.4 -81.1 -68 29.6
## 10 2020 16 -81.9 -40.6 -64.1 -78.4 -63.1 28
## # … with 132 more rows
for (i in 2020:2022) {
df_year<-week_df[week_df$year==i,]
x = df_year$week
y = df_year$avg_retail
title<-paste("average retail and recreation percent change from baseline",as.character(i))
plt<-ggplot(df_year,aes(x=x,y=y,group=1))+
geom_line(colour="red")+
geom_point(stat="identity",colour="black",pch=21, size=1.5, fill='red')+
xlab("week")+
ylab("")+
scale_y_continuous(limits = c(-90, 10), breaks = seq(-90, 10, by=10))+
scale_x_continuous(limits = c(0, 52), breaks = seq(0, 52, by=5))+
theme(axis.text.x = element_text(face="bold.italic", size=10), axis.text.y =
element_text(face="bold.italic", size=10), title =
element_text(face="bold.italic", size=12))+
ggtitle(title)
print(plt)
}
## Warning: Removed 1 row(s) containing missing values (geom_path).
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 row(s) containing missing values (geom_path).
## Removed 1 rows containing missing values (geom_point).
for (i in 2020:2022) {
df_year<-week_df[week_df$year==i,]
x = df_year$week
y = df_year$avg_gp
title<-paste("average grocery and pharmacy percent change from baseline",as.character(i))
plt<-ggplot(df_year,aes(x=x,y=y,group=1))+
geom_line(colour="blue")+
geom_point(stat="identity",colour="black",pch=21, size=1.5, fill='blue')+
xlab("week")+
ylab("")+
scale_y_continuous(limits = c(-60, 40), breaks = seq(-60, 40, by=10))+
scale_x_continuous(limits = c(0, 52), breaks = seq(0, 52, by=5))+
theme(axis.text.x = element_text(face="bold.italic", size=10), axis.text.y =
element_text(face="bold.italic", size=10), title =
element_text(face="bold.italic", size=12))+
ggtitle(title)
print(plt)
}
## Warning: Removed 1 row(s) containing missing values (geom_path).
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 row(s) containing missing values (geom_path).
## Removed 1 rows containing missing values (geom_point).
for (i in 2020:2022) {
df_year<-week_df[week_df$year==i,]
x = df_year$week
y = df_year$avg_parks
title<-paste("average parks percent change from baseline",as.character(i))
plt<-ggplot(df_year,aes(x=x,y=y,group=1))+
geom_line(colour="green")+
geom_point(stat="identity",colour="black",pch=21, size=1.5, fill='green')+
xlab("week")+
ylab("")+
scale_y_continuous(limits = c(-100, 200), breaks = seq(-100, 200, by=10))+
scale_x_continuous(limits = c(0, 52), breaks = seq(0, 52, by=5))+
theme(axis.text.x = element_text(face="bold.italic", size=10), axis.text.y =
element_text(face="bold.italic", size=10), title =
element_text(face="bold.italic", size=12))+
ggtitle(title)
print(plt)
}
## Warning: Removed 1 row(s) containing missing values (geom_path).
## Warning: Removed 2 rows containing missing values (geom_point).
## Warning: Removed 1 row(s) containing missing values (geom_path).
## Removed 2 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing missing values (geom_point).
for (i in 2020:2022) {
df_year<-week_df[week_df$year==i,]
x = df_year$week
y = df_year$avg_tran_st
title<-paste("average transit stations percent change from baseline",as.character(i))
plt<-ggplot(df_year,aes(x=x,y=y,group=1))+
geom_line(colour="violet")+
geom_point(stat="identity",colour="black",pch=21, size=1.5, fill='violet')+
xlab("week")+
ylab("")+
scale_y_continuous(limits = c(-90, 10), breaks = seq(-90, 10, by=10))+
scale_x_continuous(limits = c(0, 52), breaks = seq(0, 52, by=5))+
theme(axis.text.x = element_text(face="bold.italic", size=10), axis.text.y =
element_text(face="bold.italic", size=10), title =
element_text(face="bold.italic", size=12))+
ggtitle(title)
print(plt)
}
## Warning: Removed 1 row(s) containing missing values (geom_path).
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 row(s) containing missing values (geom_path).
## Removed 1 rows containing missing values (geom_point).
## Warning: Removed 3 row(s) containing missing values (geom_path).
## Warning: Removed 9 rows containing missing values (geom_point).
for (i in 2020:2022) {
df_year<-week_df[week_df$year==i,]
x = df_year$week
y = df_year$avg_workplace
title<-paste("average workplaces percent change from baseline",as.character(i))
plt<-ggplot(df_year,aes(x=x,y=y,group=1))+
geom_line(colour="orange")+
geom_point(stat="identity",colour="black",pch=21, size=1.5, fill='orange')+
xlab("week")+
ylab("")+
scale_y_continuous(limits = c(-80, 0), breaks = seq(-80, 0, by=10))+
scale_x_continuous(limits = c(0, 52), breaks = seq(0, 52, by=5))+
theme(axis.text.x = element_text(face="bold.italic", size=10), axis.text.y =
element_text(face="bold.italic", size=10), title =
element_text(face="bold.italic", size=12))+
ggtitle(title)
print(plt)
}
## Warning: Removed 1 row(s) containing missing values (geom_path).
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 row(s) containing missing values (geom_path).
## Removed 1 rows containing missing values (geom_point).
for (i in 2020:2022) {
df_year<-week_df[week_df$year==i,]
x = df_year$week
y = df_year$avg_res
title<-paste("average residential percent change from baseline",as.character(i))
plt<-ggplot(df_year,aes(x=x,y=y,group=1))+
geom_line(colour="pink")+
geom_point(stat="identity",colour="black",pch=21, size=1.5, fill='pink')+
xlab("week")+
ylab("")+
scale_y_continuous(limits = c(-10, 40), breaks = seq(-10, 40, by=10))+
scale_x_continuous(limits = c(0, 52), breaks = seq(0, 52, by=5))+
theme(axis.text.x = element_text(face="bold.italic", size=10), axis.text.y =
element_text(face="bold.italic", size=10), title =
element_text(face="bold.italic", size=12))+
ggtitle(title)
print(plt)
}
## Warning: Removed 1 row(s) containing missing values (geom_path).
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 row(s) containing missing values (geom_path).
## Removed 1 rows containing missing values (geom_point).
month_df = data_it %>%
group_by(year,month) %>%
summarise(avg_retail=mean(retail_and_recreation_percent_change_from_baseline),
avg_gp=mean(grocery_and_pharmacy_percent_change_from_baseline),
avg_parks=mean(parks_percent_change_from_baseline),
avg_tran_st=mean(transit_stations_percent_change_from_baseline),
avg_workplace=mean(workplaces_percent_change_from_baseline),
avg_res=mean(residential_percent_change_from_baseline),
.groups="keep")
month_df
## # A tibble: 33 × 8
## # Groups: year, month [33]
## year month avg_retail avg_gp avg_parks avg_tran_st avg_workplace avg_res
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2020 2 -3 2.6 15.5 -4 -4.53 1.6
## 2 2020 3 -61.3 -31.9 -55.5 -62.3 -46.5 21.7
## 3 2020 4 -83.4 -47.6 -77.7 -78.3 -63.9 29.2
## 4 2020 5 -52.2 -28.3 -21.2 -52.2 -37.1 15.9
## 5 2020 6 -19.8 -11.9 35.4 -31.4 -24.7 5.73
## 6 2020 7 -8.58 -7.39 79.3 -21.5 -19.8 0.645
## 7 2020 8 -5.81 -6.19 126 -19.5 -30.3 0.935
## 8 2020 9 -6.73 -3.97 58.9 -16.2 -19.6 1.07
## 9 2020 10 -17.4 -1.74 9.87 -20.1 -17 5.16
## 10 2020 11 -40.4 -12.3 -18.9 -45.1 -28.8 13.6
## # … with 23 more rows
for (i in 2020:2022) {
df_year<-month_df[month_df$year==i,]
x = df_year$month
y = df_year$avg_retail
title<-paste("average retail and recreation percent change from baseline",as.character(i))
plt<-ggplot(df_year,aes(x=x,y=y,group=1))+
geom_line(colour="red")+
geom_point(stat="identity",colour="black",pch=21, size=1.5, fill='red')+
xlab("month")+
ylab("")+
scale_y_continuous(limits = c(-90, 10), breaks = seq(-90, 10, by=10))+
scale_x_continuous(limits = c(0, 12), breaks = seq(0, 12, by=1))+
theme(axis.text.x = element_text(face="bold.italic", size=10), axis.text.y =
element_text(face="bold.italic", size=10), title =
element_text(face="bold.italic", size=12))+
ggtitle(title)
print(plt)
}
for (i in 2020:2022) {
df_year<-month_df[month_df$year==i,]
x = df_year$month
y = df_year$avg_gp
title<-paste("average grocery and pharmacy percent change from baseline",as.character(i))
plt<-ggplot(df_year,aes(x=x,y=y,group=1))+
geom_line(colour="blue")+
geom_point(stat="identity",colour="black",pch=21, size=1.5, fill='blue')+
xlab("month")+
ylab("")+
scale_y_continuous(limits = c(-50, 40), breaks = seq(-50, 40, by=10))+
scale_x_continuous(limits = c(0, 12), breaks = seq(0, 12, by=1))+
theme(axis.text.x = element_text(face="bold.italic", size=10), axis.text.y =
element_text(face="bold.italic", size=10), title =
element_text(face="bold.italic", size=12))+
ggtitle(title)
print(plt)
}
for (i in 2020:2022) {
df_year<-month_df[month_df$year==i,]
x = df_year$month
y = df_year$avg_parks
title<-paste("average parks percent change from baseline",as.character(i))
plt<-ggplot(df_year,aes(x=x,y=y,group=1))+
geom_line(colour="green")+
geom_point(stat="identity",colour="black",pch=21, size=1.5, fill='green')+
xlab("month")+
ylab("")+
scale_y_continuous(limits = c(-90, 170), breaks = seq(-90, 170, by=10))+
scale_x_continuous(limits = c(0, 12), breaks = seq(0, 12, by=1))+
theme(axis.text.x = element_text(face="bold.italic", size=10), axis.text.y =
element_text(face="bold.italic", size=10), title =
element_text(face="bold.italic", size=12))+
ggtitle(title)
print(plt)
}
for (i in 2020:2022) {
df_year<-month_df[month_df$year==i,]
x = df_year$month
y = df_year$avg_tran_st
title<-paste("average transit stations percent change from baseline",as.character(i))
plt<-ggplot(df_year,aes(x=x,y=y,group=1))+
geom_line(colour="violet")+
geom_point(stat="identity",colour="black",pch=21, size=1.5, fill='violet')+
xlab("month")+
ylab("")+
scale_y_continuous(limits = c(-90, 0), breaks = seq(-90, 0, by=10))+
scale_x_continuous(limits = c(0, 12), breaks = seq(0, 12, by=1))+
theme(axis.text.x = element_text(face="bold.italic", size=10), axis.text.y =
element_text(face="bold.italic", size=10), title =
element_text(face="bold.italic", size=12))+
ggtitle(title)
print(plt)
}
for (i in 2020:2022) {
df_year<-month_df[month_df$year==i,]
x = df_year$month
y = df_year$avg_workplace
title<-paste("average workplaces percent change from baseline",as.character(i))
plt<-ggplot(df_year,aes(x=x,y=y,group=1))+
geom_line(colour="orange")+
geom_point(stat="identity",colour="black",pch=21, size=1.5, fill='orange')+
xlab("month")+
ylab("")+
scale_y_continuous(limits = c(-80, 0), breaks = seq(-80, 0, by=10))+
scale_x_continuous(limits = c(0, 12), breaks = seq(0, 12, by=1))+
theme(axis.text.x = element_text(face="bold.italic", size=10), axis.text.y =
element_text(face="bold.italic", size=10), title =
element_text(face="bold.italic", size=12))+
ggtitle(title)
print(plt)
}
for (i in 2020:2022) {
df_year<-month_df[month_df$year==i,]
x = df_year$month
y = df_year$avg_res
title<-paste("average residential percent change from baseline",as.character(i))
plt<-ggplot(df_year,aes(x=x,y=y,group=1))+
geom_line(colour="pink")+
geom_point(stat="identity",colour="black",pch=21, size=1.5, fill='pink')+
xlab("month")+
ylab("")+
scale_y_continuous(limits = c(-10, 40), breaks = seq(-10, 40, by=10))+
scale_x_continuous(limits = c(0, 12), breaks = seq(0, 12, by=1))+
theme(axis.text.x = element_text(face="bold.italic", size=10), axis.text.y =
element_text(face="bold.italic", size=10), title =
element_text(face="bold.italic", size=12))+
ggtitle(title)
print(plt)
}
Basically the trends followed by the data related to France are very similar to the ones related to Italy. Probably meaning that the habits of the two populations and the anti covid measures of the two countries are similar.
Here I download the data related to Italy in order to analyze the data of the Regions
data_it<-data[data$country_region_code=="IT",]
data_it
## # A tibble: 130,939 × 15
## country_region_code country_region sub_region_1 sub_region_2 metro_area
## <chr> <chr> <chr> <chr> <chr>
## 1 IT Italy <NA> <NA> <NA>
## 2 IT Italy <NA> <NA> <NA>
## 3 IT Italy <NA> <NA> <NA>
## 4 IT Italy <NA> <NA> <NA>
## 5 IT Italy <NA> <NA> <NA>
## 6 IT Italy <NA> <NA> <NA>
## 7 IT Italy <NA> <NA> <NA>
## 8 IT Italy <NA> <NA> <NA>
## 9 IT Italy <NA> <NA> <NA>
## 10 IT Italy <NA> <NA> <NA>
## # … with 130,929 more rows, and 10 more variables: iso_3166_2_code <chr>,
## # census_fips_code <chr>, place_id <chr>, date <date>,
## # retail_and_recreation_percent_change_from_baseline <dbl>,
## # grocery_and_pharmacy_percent_change_from_baseline <dbl>,
## # parks_percent_change_from_baseline <dbl>,
## # transit_stations_percent_change_from_baseline <dbl>,
## # workplaces_percent_change_from_baseline <dbl>, …
data_it<-data_it[is.na(data_it$country_region_code)==FALSE,]
data_it['week']<-week(ymd(data_it$date))
data_it['month']<-month(ymd(data_it$date))
data_it['year']<-year(ymd(data_it$date))
data_it
## # A tibble: 123,695 × 18
## country_region_code country_region sub_region_1 sub_region_2 metro_area
## <chr> <chr> <chr> <chr> <chr>
## 1 IT Italy <NA> <NA> <NA>
## 2 IT Italy <NA> <NA> <NA>
## 3 IT Italy <NA> <NA> <NA>
## 4 IT Italy <NA> <NA> <NA>
## 5 IT Italy <NA> <NA> <NA>
## 6 IT Italy <NA> <NA> <NA>
## 7 IT Italy <NA> <NA> <NA>
## 8 IT Italy <NA> <NA> <NA>
## 9 IT Italy <NA> <NA> <NA>
## 10 IT Italy <NA> <NA> <NA>
## # … with 123,685 more rows, and 13 more variables: iso_3166_2_code <chr>,
## # census_fips_code <chr>, place_id <chr>, date <date>,
## # retail_and_recreation_percent_change_from_baseline <dbl>,
## # grocery_and_pharmacy_percent_change_from_baseline <dbl>,
## # parks_percent_change_from_baseline <dbl>,
## # transit_stations_percent_change_from_baseline <dbl>,
## # workplaces_percent_change_from_baseline <dbl>, …
Here I select the data related only to regions.
data_it<-data_it[is.na(data_it$sub_region_1)==FALSE,]
data_it<-data_it[is.na(data_it$sub_region_2)==TRUE,]
data_it<-data_it[is.na(data_it$metro_area)==TRUE,]
data_it
## # A tibble: 19,480 × 18
## country_region_code country_region sub_region_1 sub_region_2 metro_area
## <chr> <chr> <chr> <chr> <chr>
## 1 IT Italy Abruzzo <NA> <NA>
## 2 IT Italy Abruzzo <NA> <NA>
## 3 IT Italy Abruzzo <NA> <NA>
## 4 IT Italy Abruzzo <NA> <NA>
## 5 IT Italy Abruzzo <NA> <NA>
## 6 IT Italy Abruzzo <NA> <NA>
## 7 IT Italy Abruzzo <NA> <NA>
## 8 IT Italy Abruzzo <NA> <NA>
## 9 IT Italy Abruzzo <NA> <NA>
## 10 IT Italy Abruzzo <NA> <NA>
## # … with 19,470 more rows, and 13 more variables: iso_3166_2_code <chr>,
## # census_fips_code <chr>, place_id <chr>, date <date>,
## # retail_and_recreation_percent_change_from_baseline <dbl>,
## # grocery_and_pharmacy_percent_change_from_baseline <dbl>,
## # parks_percent_change_from_baseline <dbl>,
## # transit_stations_percent_change_from_baseline <dbl>,
## # workplaces_percent_change_from_baseline <dbl>, …
Here I group by the quantities by the regions calculating their averages
region_df = data_it %>%
group_by(sub_region_1) %>%
summarise(avg_retail=mean(retail_and_recreation_percent_change_from_baseline),
avg_gp=mean(grocery_and_pharmacy_percent_change_from_baseline),
avg_parks=mean(parks_percent_change_from_baseline),
avg_tran_st=mean(transit_stations_percent_change_from_baseline),
avg_workplace=mean(workplaces_percent_change_from_baseline),
avg_res=mean(residential_percent_change_from_baseline),
.groups="keep")
region_df
## # A tibble: 20 × 7
## # Groups: sub_region_1 [20]
## sub_region_1 avg_retail avg_gp avg_parks avg_tran_st avg_workplace avg_res
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Abruzzo -13.3 7.50 NA -0.0832 -20.5 5.35
## 2 Aosta NA NA NA NA NA NA
## 3 Apulia -11.3 9.36 62.6 -5.61 -20.7 4.33
## 4 Basilicata -9.15 NA NA NA -19.9 5.31
## 5 Calabria -5.27 18.1 NA 15.1 -22.4 5.56
## 6 Campania -17.8 8.89 16.2 -19.5 -23.8 5.83
## 7 Emilia-Romagna -18.3 1.56 30.2 -25.3 -18.8 6.02
## 8 Friuli-Venezia… -19.6 -0.855 NA -17.7 -21.3 6.68
## 9 Lazio -21.4 -1.10 0.494 -34.4 -24.8 6.64
## 10 Liguria -14.1 5.25 53.0 -14.7 -20.5 5.59
## 11 Lombardy -26.3 -4.08 17.2 -34.7 -25.6 7.84
## 12 Marche -14.1 3.77 NA -14.1 -17.8 5.73
## 13 Molise NA NA NA NA NA NA
## 14 Piedmont -24.3 -3.34 15.2 -31.1 -24.0 6.92
## 15 Sardinia -8.54 18.9 NA -4.91 -16.7 5.12
## 16 Sicily -14.4 6.03 46.5 -10.0 -21.9 4.78
## 17 Trentino-South… -19.5 6.32 NA -8.79 -22.3 5.84
## 18 Tuscany -17.8 1.09 50.3 -22.3 -18.7 6.11
## 19 Umbria -17.3 1.02 NA -11.8 -18.7 5.74
## 20 Veneto -17.9 -1.39 40.0 -24.2 -20.1 6.19
Here I plot the results. The empty regions corresponds to NA values.
ggplot(region_df,aes(x=avg_retail, y=sub_region_1))+
geom_bar(stat='identity',col='navy',fill='red') +
geom_text(aes(label=round(avg_retail,2), size=0.5), hjust=0, size=3, color='black')+
ylab("Region")+
xlab('')+
theme(axis.text.x = element_text(face="bold.italic", size=10), axis.text.y =
element_text(face="bold.italic", size=10), title =
element_text(face="bold.italic", size=11))+
ggtitle("average retail and recreation percent change from baseline")
## Warning: Removed 2 rows containing missing values (position_stack).
## Warning: Removed 2 rows containing missing values (geom_text).
Here it is possible to see that almost all the regions have a decrease
of the quantity “retail and recreation” with a maximum in Lombardy, the
region which suffered the most from covid 19.
ggplot(region_df,aes(x=avg_gp, y=sub_region_1))+
geom_bar(stat='identity',col='navy',fill='blue') +
geom_text(aes(label=round(avg_gp,2), size=0.5), hjust=0, size=3, color='black')+
ylab("Region")+
xlab('')+
theme(axis.text.x = element_text(face="bold.italic", size=10), axis.text.y =
element_text(face="bold.italic", size=10), title =
element_text(face="bold.italic", size=11))+
ggtitle("average grocery and pharmacy percent change from baseline")
## Warning: Removed 3 rows containing missing values (position_stack).
## Warning: Removed 3 rows containing missing values (geom_text).
From “grocery and pharmacy” there is not a particular behaviour
ggplot(region_df,aes(x=avg_parks, y=sub_region_1))+
geom_bar(stat='identity',col='navy',fill='green') +
geom_text(aes(label=round(avg_parks,2), size=0.5), hjust=0, size=3, color='black')+
ylab("Region")+
xlab('')+
theme(axis.text.x = element_text(face="bold.italic", size=10), axis.text.y =
element_text(face="bold.italic", size=10), title =
element_text(face="bold.italic", size=11))+
ggtitle("average parks percent change from baseline")
## Warning: Removed 10 rows containing missing values (position_stack).
## Warning: Removed 10 rows containing missing values (geom_text).
Here it is possible to see how there is a general increase of the
visitors in the parks.
ggplot(region_df,aes(x=avg_tran_st, y=sub_region_1))+
geom_bar(stat='identity',col='navy',fill='violet') +
geom_text(aes(label=round(avg_tran_st,2), size=0.5), hjust=0, size=3, color='black')+
ylab("Region")+
xlab('')+
theme(axis.text.x = element_text(face="bold.italic", size=10), axis.text.y =
element_text(face="bold.italic", size=10), title =
element_text(face="bold.italic", size=11))+
ggtitle("average transit stations percent change from baseline")
## Warning: Removed 3 rows containing missing values (position_stack).
## Warning: Removed 3 rows containing missing values (geom_text).
ggplot(region_df,aes(x=avg_workplace, y=sub_region_1))+
geom_bar(stat='identity',col='navy',fill='orange') +
geom_text(aes(label=round(avg_workplace,2), size=0.5), hjust=0, size=3, color='black')+
ylab("Region")+
xlab('')+
theme(axis.text.x = element_text(face="bold.italic", size=10), axis.text.y =
element_text(face="bold.italic", size=10), title =
element_text(face="bold.italic", size=11))+
ggtitle("average workplace percent change from baseline")
## Warning: Removed 2 rows containing missing values (position_stack).
## Warning: Removed 2 rows containing missing values (geom_text).
The behaviour followed by “workplace” and “transit station” data is similar to the one of “retail and recreation” data.
ggplot(region_df,aes(x=avg_res, y=sub_region_1))+
geom_bar(stat='identity',col='navy',fill='pink') +
geom_text(aes(label=round(avg_res,2), size=0.5), hjust=0, size=3, color='black')+
ylab("Region")+
xlab('')+
theme(axis.text.x = element_text(face="bold.italic", size=10), axis.text.y =
element_text(face="bold.italic", size=10), title =
element_text(face="bold.italic", size=11))+
ggtitle("average residential percent change from baseline")
## Warning: Removed 2 rows containing missing values (position_stack).
## Warning: Removed 2 rows containing missing values (geom_text).
Obviously the time spent by people in their house increases for all the regions.
In order to have a global view, I repeat the analysis done for the Italy regions using the data from the countries of the world
data_world<-data
data_world<-data_world[is.na(data_world$country_region_code)==FALSE,]
data_world<-data_world[is.na(data_world$sub_region_1)==TRUE,]
data_world<-data_world[is.na(data_world$sub_region_2)==TRUE,]
data_world<-data_world[is.na(data_world$metro_area)==TRUE,]
data_world
## # A tibble: 129,850 × 15
## country_region_code country_region sub_region_1 sub_region_2 metro_area
## <chr> <chr> <chr> <chr> <chr>
## 1 AE United Arab Emirates <NA> <NA> <NA>
## 2 AE United Arab Emirates <NA> <NA> <NA>
## 3 AE United Arab Emirates <NA> <NA> <NA>
## 4 AE United Arab Emirates <NA> <NA> <NA>
## 5 AE United Arab Emirates <NA> <NA> <NA>
## 6 AE United Arab Emirates <NA> <NA> <NA>
## 7 AE United Arab Emirates <NA> <NA> <NA>
## 8 AE United Arab Emirates <NA> <NA> <NA>
## 9 AE United Arab Emirates <NA> <NA> <NA>
## 10 AE United Arab Emirates <NA> <NA> <NA>
## # … with 129,840 more rows, and 10 more variables: iso_3166_2_code <chr>,
## # census_fips_code <chr>, place_id <chr>, date <date>,
## # retail_and_recreation_percent_change_from_baseline <dbl>,
## # grocery_and_pharmacy_percent_change_from_baseline <dbl>,
## # parks_percent_change_from_baseline <dbl>,
## # transit_stations_percent_change_from_baseline <dbl>,
## # workplaces_percent_change_from_baseline <dbl>, …
states_df = data_world %>%
group_by(country_region) %>%
summarise(avg_retail=mean(retail_and_recreation_percent_change_from_baseline),
avg_gp=mean(grocery_and_pharmacy_percent_change_from_baseline),
avg_parks=mean(parks_percent_change_from_baseline),
avg_tran_st=mean(transit_stations_percent_change_from_baseline),
avg_workplace=mean(workplaces_percent_change_from_baseline),
avg_res=mean(residential_percent_change_from_baseline),
.groups="keep")
ggplot(states_df,aes(x=avg_retail, y=country_region))+
geom_bar(stat='identity',col='navy',fill='red') +
geom_text(aes(label=round(avg_retail,2), size=0.5), hjust=0, size=3, color='black')+
ylab("country")+
xlab('')+
theme(axis.text.x = element_text(face="bold.italic", size=10), axis.text.y =
element_text(face="bold.italic", size=10), title =
element_text(face="bold.italic", size=11))+
ggtitle("average retail and recreation percent change from baseline")
## Warning: Removed 21 rows containing missing values (position_stack).
## Warning: Removed 21 rows containing missing values (geom_text).
The decline of the time spent in resturant or recreation trend is a trend respected by the majority of the countries. There is only one outlier: the Lybia.
ggplot(states_df,aes(x=avg_gp, y=country_region))+
geom_bar(stat='identity',col='navy',fill='blue') +
geom_text(aes(label=round(avg_gp,2), size=0.5), hjust=0, size=3, color='black')+
ylab("Country")+
xlab('')+
theme(axis.text.x = element_text(face="bold.italic", size=10), axis.text.y =
element_text(face="bold.italic", size=10), title =
element_text(face="bold.italic", size=11))+
ggtitle("average grocery and pharmacy percent change from baseline")
## Warning: Removed 23 rows containing missing values (position_stack).
## Warning: Removed 23 rows containing missing values (geom_text).
ggplot(states_df,aes(x=avg_parks, y=country_region))+
geom_bar(stat='identity',col='navy',fill='green') +
geom_text(aes(label=round(avg_parks,2), size=0.5), hjust=0, size=3, color='black')+
ylab("Country")+
xlab('')+
theme(axis.text.x = element_text(face="bold.italic", size=10), axis.text.y =
element_text(face="bold.italic", size=10), title =
element_text(face="bold.italic", size=11))+
ggtitle("average parks percent change from baseline")
## Warning: Removed 31 rows containing missing values (position_stack).
## Warning: Removed 31 rows containing missing values (geom_text).
ggplot(states_df,aes(x=avg_tran_st, y=country_region))+
geom_bar(stat='identity',col='navy',fill='violet') +
geom_text(aes(label=round(avg_tran_st,2), size=0.5), hjust=0, size=3, color='black')+
ylab("Country")+
xlab('')+
theme(axis.text.x = element_text(face="bold.italic", size=10), axis.text.y =
element_text(face="bold.italic", size=10), title =
element_text(face="bold.italic", size=11))+
ggtitle("average transit stations percent change from baseline")
## Warning: Removed 25 rows containing missing values (position_stack).
## Warning: Removed 25 rows containing missing values (geom_text).
ggplot(states_df,aes(x=avg_workplace, y=country_region))+
geom_bar(stat='identity',col='navy',fill='orange') +
geom_text(aes(label=round(avg_workplace,2), size=0.5), hjust=0, size=3, color='black')+
ylab("Country")+
xlab('')+
theme(axis.text.x = element_text(face="bold.italic", size=10), axis.text.y =
element_text(face="bold.italic", size=10), title =
element_text(face="bold.italic", size=11))+
ggtitle("average workplace percent change from baseline")
## Warning: Removed 9 rows containing missing values (position_stack).
## Warning: Removed 9 rows containing missing values (geom_text).
Globally the time spent at worplaces is below the level of the baseline.
ggplot(states_df,aes(x=avg_res, y=country_region))+
geom_bar(stat='identity',col='navy',fill='pink') +
geom_text(aes(label=round(avg_res,2), size=0.5), hjust=0, size=3, color='black')+
ylab("Country")+
xlab('')+
theme(axis.text.x = element_text(face="bold.italic", size=10), axis.text.y =
element_text(face="bold.italic", size=10), title =
element_text(face="bold.italic", size=11))+
ggtitle("average workplace percent change from baseline")
## Warning: Removed 11 rows containing missing values (position_stack).
## Warning: Removed 11 rows containing missing values (geom_text).
From the plot it is possible to see how the growth of the time spent at home is almost a global behaviour
#Exercise 2
x<-readline(prompt = "insert an integer number with at least 10 digits:")
## insert an integer number with at least 10 digits:
x<-as.numeric(x)
n<-readline(prompt = "how many numbers do you want to generate?")
## how many numbers do you want to generate?
n<-as.numeric(n)
generated_numbers<-c()
for (i in 1:n) {
a<-unlist(strsplit(as.character(x),""))
x_squared<-x^2
number <- unlist(strsplit(as.character(x_squared),""))
mid_el<-round(length(number)/2)
shift_1<-round(length(a)/2)
shift_2<-shift_1-1
number_after_trimming<-number[(mid_el-shift_1):(mid_el+shift_2)]
x<-as.numeric(paste(number_after_trimming, collapse=""))
generated_numbers<-append(generated_numbers,x)
}
## Error in 1:n: Argomento NA/NaN
generated_numbers
## NULL
#Exercise 3
As likelihood I use a binomial distribution with n = 150 and p = 29/150, because there are two possible states: a person read or didn’t read the journal.
x<-seq(0, 1, length.out = 151)
prior<-dunif(x,0,1)
r<-seq(0, 150, length.out = 151)
n<-150
p<-29/150
like<-dbinom(r,n,p)
now I calculate the unnormalized posterior with r = 29.
r<-29
n<-150
n_sample <- 150
p<-seq(from=1/n_sample, by=1/n_sample, length.out = n_sample)
unnorm_posterior<-dbinom(x=r, size=n, prob=p)
And calculate the normalization
Z<-sum(unnorm_posterior/n_sample)
Z
## [1] 0.006622517
The normalized posterior is:
posterior<-unnorm_posterior/Z
Here I plot the posterior
plot<-ggplot(data.frame(x=p,y=posterior),aes(x,y))+geom_line(color="red")+
geom_line(colour="red",size=1)+
xlab("p")+
ylab("")+
ggtitle("Posterior")+
scale_y_continuous(limits = c(0, 13), breaks = seq(0, 13, by=1))+
scale_x_continuous(limits = c(0, 1), breaks = seq(0, 1, by=0.1))+
theme(axis.text.x = element_text(face="bold.italic", size=10), axis.text.y =
element_text(face="bold.italic", size=10), title =
element_text(face="bold.italic", size=12))
plot
I plot the not normalized posterior
plot<-ggplot(data.frame(x=p,y=unnorm_posterior),aes(x,y))+
geom_line(colour="orange",size=1)+
xlab("p")+
ylab("")+
ggtitle("Not Notmalized Posterior")+
scale_y_continuous(limits = c(0, 0.09), breaks = seq(0, 0.09, by=0.01))+
scale_x_continuous(limits = c(0, 1), breaks = seq(0, 1, by=0.1))+
theme(axis.text.x = element_text(face="bold.italic", size=10), axis.text.y =
element_text(face="bold.italic", size=10), title =
element_text(face="bold.italic", size=12))
plot
Here I plot the likelihood
r<-0:150
plot<-ggplot(data.frame(x=r,y=like),aes(x,y))+geom_line(color="red")+
geom_line(colour="blue",size=1)+
xlab("r")+
ylab("")+
ggtitle("Likelihood")+
scale_y_continuous(limits = c(0, 0.09), breaks = seq(0, 0.09, by=0.01))+
scale_x_continuous(limits = c(0, 150), breaks = seq(0, 150, by=10))+
theme(axis.text.x = element_text(face="bold.italic", size=10), axis.text.y =
element_text(face="bold.italic", size=10), title =
element_text(face="bold.italic", size=12))
plot
In order to check the results I calculate the not normalized posterior multiplying the likelihood with posterior
multi_posterior<-like*prior
p<-seq(0, 1, length.out = 151)
plot<-ggplot(data.frame(x=p,y=multi_posterior),aes(x,y))+
geom_line(colour="orange",size=1)+
xlab("p")+
ylab("")+
ggtitle("Not Notmalized Posterior")+
scale_y_continuous(limits = c(0, 0.09), breaks = seq(0, 0.09, by=0.01))+
scale_x_continuous(limits = c(0, 1), breaks = seq(0, 1, by=0.1))+
theme(axis.text.x = element_text(face="bold.italic", size=10), axis.text.y =
element_text(face="bold.italic", size=10), title =
element_text(face="bold.italic", size=12))
plot
The result is the same so the process is correct.
#Exercise 4
At first I define the posterior assuming n = 30 and r = 15. Wen the Beta distribution is used I set a = 3 and b = 9. When the functions are defined I calculate the Z in both case and than define again the posteriors normalizing them.
#prior_unif<-dunif(x,0,1)
#prior_beta<-dbeta(x,1,1)
n<-30
r<-15
n_sample<-30
a<-3
b<-9
p<-seq(from=1/n_sample, by=1/n_sample, length.out = n_sample)
unnorm_posterior_unif<-function(x,r,n){
out<-x^(r)*(1-x)^(n-r)
}
Z_unif<-sum(unnorm_posterior_unif(p,r=r,n=n)/n_sample)
posterior_unif<-function(x,r,n){
out<-x^(r)*(1-x)^(n-r)/Z_unif
}
unnorm_posterior_beta<-function(x,r,n,a,b){
out<-x^(r+a-1)*(1-x)^(n-r+b-1)
}
Z_beta<-sum(unnorm_posterior_beta(p,r=r,n=n,a=a,b=b))/n_sample
posterior_beta<-function(x,r,n,a,b){
out<-x^(r+a-1)*(1-x)^(n-r+b-1)/Z_beta
}
Now i define the likelihood, that is a binomial distribution with p = 15/30 and n = 30
x<-1:30
like<-dbinom(x,size=30,p=15/30)
Here I plot the likelihood
plot<-ggplot(data.frame(x=x,y=like),aes(x,y))+
geom_line(colour="green",size=1)+
xlab("p")+
ylab("")+
ggtitle("Likelihood( n=30,r=15)")+
scale_y_continuous(limits = c(0,0.2), breaks = seq(0, 0.2, by=0.01))+
scale_x_continuous(limits = c(0, 30), breaks = seq(0, 30, by=5))+
theme(axis.text.x = element_text(face="bold.italic", size=10), axis.text.y =
element_text(face="bold.italic", size=10), title =
element_text(face="bold.italic", size=12))
plot
Here I plot the posteriors
plot<-ggplot(data.frame(x=p,y=posterior_unif(p,r=r,n=n)),aes(x,y))+
geom_line(colour="red",size=1)+
xlab("p")+
ylab("")+
ggtitle("Posterior(uniform prior, n=30,r=15)")+
scale_y_continuous(limits = c(0,6), breaks = seq(0, 6, by=1))+
scale_x_continuous(limits = c(0, 1), breaks = seq(0, 1, by=0.1))+
theme(axis.text.x = element_text(face="bold.italic", size=10), axis.text.y =
element_text(face="bold.italic", size=10), title =
element_text(face="bold.italic", size=12))
plot
plot<-ggplot(data.frame(x=p,y=posterior_beta(p,r=r,n=n,a=a,b=b)),aes(x,y))+
geom_line(colour="blue",size=1)+
xlab("p")+
ylab("")+
ggtitle("Posterior(uniform prior, n=30,r=15)")+
scale_y_continuous(limits = c(0,6), breaks = seq(0, 6, by=1))+
scale_x_continuous(limits = c(0, 1), breaks = seq(0, 1, by=0.1))+
theme(axis.text.x = element_text(face="bold.italic", size=10), axis.text.y =
element_text(face="bold.italic", size=10), title =
element_text(face="bold.italic", size=12))
plot
Here I find the most probable values of p. First for Beta prior:
MAX_beta<-max(posterior_beta(p,r=r,n=n,a=a,b=b))
index_max_beta<-which(posterior_beta(p,r=r,n=n,a=a,b=b)==MAX_beta)
p_beta_max<-p[index_max_beta]
cat("the most probable value of p(when the prior is a beta distribution B(a=3,b=9)) is", p_beta_max)
## the most probable value of p(when the prior is a beta distribution B(a=3,b=9)) is 0.4333333
Second for a uniform prior:
r = 15
n = 30
MAX_unif<-max(posterior_unif(p,r=r,n=n))
index_max_unif<-which(posterior_unif(p,r=r,n=n)==MAX_unif)
p_unif_max<-p[index_max_unif]
cat("the most probable value of p(when the prior is a uniform distribution) is", p_unif_max)
## the most probable value of p(when the prior is a uniform distribution) is 0.5
Here I find the limits of the interval for 95% credibility interval. First for uniform prior:
q = 0.025
lim_inf<-qbeta(q,r+1,n-r+1)
lim_sup<-qbeta(1-q,r+1,n-r+1)
cat("the inteval for 95% credibility interval is [", lim_inf,",",lim_sup,"]")
## the inteval for 95% credibility interval is [ 0.330606 , 0.669394 ]
cat("\n")
cat("the probability corresponding to the two extremes is ",integrate(posterior_unif,lim_inf,lim_sup,r=r,n=n)$value)
## the probability corresponding to the two extremes is 0.95
lim_inf<-qbeta(q,r+a+1,n-r+b+1)
lim_sup<-qbeta(1-q,r+a+1,n-r+b+1)
cat("the inteval for 95% credibility interval is [", lim_inf,",",lim_sup,"]")
## the inteval for 95% credibility interval is [ 0.2907812 , 0.5787304 ]
cat("\n")
cat("the probability corresponding to the two extremes is ",integrate(posterior_beta, lim_inf,lim_sup,r=r,n=n,a=a,b=b)$value)
## the probability corresponding to the two extremes is 0.9441157
Here I see how the pevious quantities vary analyzing the data sequentially.
head<-c(1,1,1,1,1,0,1,1,0,0,1,1,0,0,0,1,0,1,0,1,0,0,1,0,1,0,1,0,0,0)
p_b_max_vec<-c()
p_u_max_vec<-c()
lim_inf_b_vec<-c()
lim_sup_b_vec<-c()
lim_inf_u_vec<-c()
lim_sup_u_vec<-c()
p<-seq(0, 1, length.out = 201)
for (i in 1:length(head)) {
print(i)
samples<-head[1:i]
r<-sum(samples)
n<-length(samples)
q=0.025
MAX_beta<-max(posterior_beta(p,r=r,n=n,a=a,b=b))
index_max_beta<-which(posterior_beta(p,r=r,n=n,a=a,b=b)==MAX_beta)
p_b_max<-p[index_max_beta]
MAX_unif<-max(posterior_unif(p,r=r,n=n))
index_max_unif<-which(posterior_unif(p,r=r,n=n)==MAX_unif)
p_u_max<-p[index_max_unif]
lim_inf_u<-qbeta(q,r+1,n-r+1)
lim_sup_u<-qbeta(1-q,r+1,n-r+1)
lim_inf_b<-qbeta(q,r+a+1,n-r+b+1)
lim_sup_b<-qbeta(1-q,r+a+1,n-r+b+1)
p_b_max_vec<-append(p_b_max_vec,p_b_max)
p_u_max_vec<-append(p_u_max_vec,p_u_max)
lim_inf_b_vec<-append(lim_inf_b_vec,lim_inf_b)
lim_sup_b_vec<-append(lim_sup_b_vec,lim_sup_b)
lim_inf_u_vec<-append(lim_inf_u_vec,lim_inf_u)
lim_sup_u_vec<-append(lim_sup_u_vec,lim_sup_u)
}
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 9
## [1] 10
## [1] 11
## [1] 12
## [1] 13
## [1] 14
## [1] 15
## [1] 16
## [1] 17
## [1] 18
## [1] 19
## [1] 20
## [1] 21
## [1] 22
## [1] 23
## [1] 24
## [1] 25
## [1] 26
## [1] 27
## [1] 28
## [1] 29
## [1] 30
plot<-ggplot(data.frame(x=1:30,inf=lim_inf_u_vec,sup=lim_sup_u_vec,max=p_u_max_vec))+
geom_line(aes(x, inf, colour='inferior limit'))+
geom_line(aes(x, sup, colour='superior limit'))+
geom_line(aes(x, max, colour='most probable p'))+
xlab("n")+
ylab("")+
ggtitle("Posterior(uniform prior, n=30,r=15)")+
scale_y_continuous(limits = c(0,1), breaks = seq(0, 1, by=0.1))+
scale_x_continuous(limits = c(0, 30), breaks = seq(0, 30, by=1))+
scale_color_manual(values = c('inferior limit' = 'red','superior limit' = 'blue', 'most probable p' = 'green'))
theme(axis.text.x = element_text(face="bold.italic", size=10), axis.text.y =
element_text(face="bold.italic", size=10), title =
element_text(face="bold.italic", size=12))
## List of 3
## $ title :List of 11
## ..$ family : NULL
## ..$ face : chr "bold.italic"
## ..$ colour : NULL
## ..$ size : num 12
## ..$ hjust : NULL
## ..$ vjust : NULL
## ..$ angle : NULL
## ..$ lineheight : NULL
## ..$ margin : NULL
## ..$ debug : NULL
## ..$ inherit.blank: logi FALSE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## $ axis.text.x:List of 11
## ..$ family : NULL
## ..$ face : chr "bold.italic"
## ..$ colour : NULL
## ..$ size : num 10
## ..$ hjust : NULL
## ..$ vjust : NULL
## ..$ angle : NULL
## ..$ lineheight : NULL
## ..$ margin : NULL
## ..$ debug : NULL
## ..$ inherit.blank: logi FALSE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## $ axis.text.y:List of 11
## ..$ family : NULL
## ..$ face : chr "bold.italic"
## ..$ colour : NULL
## ..$ size : num 10
## ..$ hjust : NULL
## ..$ vjust : NULL
## ..$ angle : NULL
## ..$ lineheight : NULL
## ..$ margin : NULL
## ..$ debug : NULL
## ..$ inherit.blank: logi FALSE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## - attr(*, "class")= chr [1:2] "theme" "gg"
## - attr(*, "complete")= logi FALSE
## - attr(*, "validate")= logi TRUE
plot
cat('the most probable p at n=30 is ',p_u_max_vec[30])
## the most probable p at n=30 is 0.5
cat('\n')
cat('the superior limit of the confidentiality interval at n=30 is ',lim_sup_u_vec[30])
## the superior limit of the confidentiality interval at n=30 is 0.669394
cat('\n')
cat('the inferior limit of the confidentiality interval at n=30 is ',lim_inf_u_vec[30])
## the inferior limit of the confidentiality interval at n=30 is 0.330606
cat('\n')
plot<-ggplot(data.frame(x=1:30,inf=lim_inf_b_vec,sup=lim_sup_b_vec,max=p_b_max_vec))+
geom_line(aes(x, inf, colour='inferior limit'))+
geom_line(aes(x, sup, colour='superior limit'))+
geom_line(aes(x, max, colour='most probable p'))+
xlab("n")+
ylab("")+
ggtitle("Posterior(uniform prior, n=30,r=15)")+
scale_y_continuous(limits = c(0,1), breaks = seq(0, 1, by=0.1))+
scale_x_continuous(limits = c(0, 30), breaks = seq(0, 30, by=1))+
scale_color_manual(values = c('inferior limit' = 'red','superior limit' = 'blue', 'most probable p' = 'green'))
theme(axis.text.x = element_text(face="bold.italic", size=10), axis.text.y =
element_text(face="bold.italic", size=10), title =
element_text(face="bold.italic", size=12))
## List of 3
## $ title :List of 11
## ..$ family : NULL
## ..$ face : chr "bold.italic"
## ..$ colour : NULL
## ..$ size : num 12
## ..$ hjust : NULL
## ..$ vjust : NULL
## ..$ angle : NULL
## ..$ lineheight : NULL
## ..$ margin : NULL
## ..$ debug : NULL
## ..$ inherit.blank: logi FALSE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## $ axis.text.x:List of 11
## ..$ family : NULL
## ..$ face : chr "bold.italic"
## ..$ colour : NULL
## ..$ size : num 10
## ..$ hjust : NULL
## ..$ vjust : NULL
## ..$ angle : NULL
## ..$ lineheight : NULL
## ..$ margin : NULL
## ..$ debug : NULL
## ..$ inherit.blank: logi FALSE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## $ axis.text.y:List of 11
## ..$ family : NULL
## ..$ face : chr "bold.italic"
## ..$ colour : NULL
## ..$ size : num 10
## ..$ hjust : NULL
## ..$ vjust : NULL
## ..$ angle : NULL
## ..$ lineheight : NULL
## ..$ margin : NULL
## ..$ debug : NULL
## ..$ inherit.blank: logi FALSE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## - attr(*, "class")= chr [1:2] "theme" "gg"
## - attr(*, "complete")= logi FALSE
## - attr(*, "validate")= logi TRUE
plot
cat('the most probable p at n=30 is ',p_b_max_vec[30])
## the most probable p at n=30 is 0.425
cat('\n')
cat('the superior limit of the confidentiality interval at n=30 is ',lim_sup_b_vec[30])
## the superior limit of the confidentiality interval at n=30 is 0.5787304
cat('\n')
cat('the inferior limit of the confidentiality interval at n=30 is ',lim_inf_b_vec[30])
## the inferior limit of the confidentiality interval at n=30 is 0.2907812
cat('\n')
The final values obtained are the same in both the data analysis methods.